#! /usr/bin/python3

import pandas as pd
import numpy as np
import itertools
import pickle
import random, re, math
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, brier_score_loss, log_loss, f1_score, accuracy_score, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from scipy import sparse

def showTopSVM(classifier, vectorizer, n):
    """Print the n highest- and n lowest-weighted features of a linear model.

    Parameters:
        classifier: fitted model exposing ``coef_`` as a dense array or a
            scipy sparse matrix; only the first row of ``coef_`` is used.
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
            or the older ``get_feature_names()``.
        n: number of features to print on each side; values <= 0 print none.

    Returns:
        None. The feature names are written to stdout, the "uncivil" side in
        ascending order of weight and the "civil" side starting from the most
        negative weight.
    """
    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # the replacement and fall back only when it is not available.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = np.asarray(get_names())

    # coef_ is sparse for some estimators/inputs and dense for others.
    coef = classifier.coef_
    if hasattr(coef, "toarray"):
        coef = coef.toarray()
    weights = np.atleast_2d(np.asarray(coef))[0]

    # Sort once and slice both ends from the same ordering.
    order = np.argsort(weights)
    n = max(int(n), 0)
    # order[-0:] would be the whole array, so n == 0 needs its own branch.
    temp_pos = order[-n:].tolist() if n else []
    temp_neg = order[:n].tolist()

    print("Uncivil features:\n")
    for t in temp_pos:
        print("%s" % (feature_names[t]))
    print("\n\nCivil features:\n")
    for t in temp_neg:
        print("%s" % (feature_names[t]))

def showTopKBest(classifier, vectorizer, kbest, n, extra_names=None):
    """Print the top/bottom weighted features of a model trained after SelectKBest.

    Parameters:
        classifier: fitted model exposing ``coef_`` as a dense array or a
            scipy sparse matrix; only the first row of ``coef_`` is used.
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
            or the older ``get_feature_names()``.
        kbest: fitted selector whose ``get_support()`` mask picks the
            vectorizer features that were kept.
        n: number of features to print on each side; values <= 0 print none.
        extra_names: optional labels for columns appended after the selected
            text features (this script stacks two such columns onto the
            design matrix). Columns without a label are printed as
            ``<unnamed column i>``.

    Returns:
        None. The feature names are written to stdout.
    """
    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # the replacement and fall back only when it is not available.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    labels = list(np.asarray(get_names())[kbest.get_support()])
    if extra_names is not None:
        labels.extend(extra_names)

    # coef_ is sparse for some estimators/inputs and dense for others.
    coef = classifier.coef_
    if hasattr(coef, "toarray"):
        coef = coef.toarray()
    weights = np.atleast_2d(np.asarray(coef))[0]

    # The model may have been trained on more columns than there are labels;
    # pad so a highly ranked appended column cannot raise IndexError.
    for column in range(len(labels), len(weights)):
        labels.append("<unnamed column %d>" % column)

    # Sort once and slice both ends from the same ordering.
    order = np.argsort(weights)
    n = max(int(n), 0)
    # order[-0:] would be the whole array, so n == 0 needs its own branch.
    temp_pos = order[-n:].tolist() if n else []
    temp_neg = order[:n].tolist()

    print("Uncivil features:\n")
    for t in temp_pos:
        print("%s" % (labels[t]))
    print("\n\nCivil features:\n")
    for t in temp_neg:
        print("%s" % (labels[t]))

def _stack_extra_columns(text_features, *columns):
    """Append dense column vectors to a sparse feature matrix and return CSR."""
    return sparse.csr_matrix(sparse.hstack((text_features,) + columns))


if __name__=="__main__":

    # Training data: the columns read below are text, code, sentiment, maxscore.
    filename = 'usa_training.csv'
    s = pd.read_table(filename, sep=',', header=0, encoding='utf-8')
    # One entry per line; the list is handed to the vectorizer as stop words.
    # Read with the same encoding as the training file.
    with open('usa_names.csv', encoding='utf-8') as f:
        usa_names = f.read().splitlines()

    text = s.text.tolist()
    y = np.array(s.code.tolist())
    # Column vectors so they can be stacked next to the text features.
    sent = np.array(s.sentiment.tolist()).reshape(-1, 1)
    maxscores = np.array(s.maxscore.tolist()).reshape(-1, 1)
    # NOTE(review): the vectorizer is fitted on all documents before the
    # cross-validation split, so vocabulary and idf weights see held-out text.
    vectorizer = TfidfVectorizer(stop_words=usa_names, ngram_range=(1,2), max_features=5000)
    X = vectorizer.fit_transform(text)

    # NOTE(review): `ratio=` is the older imbalanced-learn spelling of
    # `sampling_strategy=` -- confirm against the installed version.
    clfs = [
           ('SVM', SVC(kernel='linear', C=100, probability=True)),
           ('Balanced Bagging SVM',BalancedBaggingClassifier(SVC(kernel='linear', C=100, probability=True),
                                                             random_state=0, n_estimators=50,
                                                           replacement=True, ratio='all')),
           ('Decision Tree with Bagging', BaggingClassifier(random_state=0, n_estimators=50)),
           # L1 needs a solver that supports it; liblinear was the historical
           # default, so naming it keeps old results and works on new releases.
           ('Logistic Regression', LogisticRegression(penalty='l1', solver='liblinear'))
           ]

    for name, clf in clfs:
        print("Classifier is: %s." %name)
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        accs = []
        f1s = []
        aucs = []
        briers = []
        properr = []
        for train_index, test_index in skf.split(X, y):
            y_train, y_test = y[train_index], y[test_index]

            # Feature selection is fitted on the training fold only and then
            # applied unchanged to the test fold.
            kbest = SelectKBest(chi2, k=2000)
            X_train = _stack_extra_columns(
                kbest.fit_transform(X[train_index], y_train),
                maxscores[train_index], sent[train_index])
            X_test = _stack_extra_columns(
                kbest.transform(X[test_index]),
                maxscores[test_index], sent[test_index])

            clf.fit(X_train, y_train)
            yhat = clf.predict(X_test)
            phat = clf.predict_proba(X_test)[:,1]

            accs.append(accuracy_score(y_test, yhat))
            f1s.append(f1_score(y_test, yhat))
            # AUC is a ranking metric: score it on the positive-class
            # probabilities, not on the thresholded 0/1 predictions.
            aucs.append(roc_auc_score(y_test, phat))
            briers.append(brier_score_loss(y_test, phat))
            # Gap between the observed and predicted share of positives.
            properr.append(abs(np.mean(y_test)-np.mean(yhat)))
        print("\nAverage accuracy:")
        print("The percent correctly predicted is %0.2f%%" % (np.mean(accs)*100))
        print("The F1 score is %0.3f." % np.mean(f1s))
        print("The area under the ROC curve is %0.3f." % np.mean(aucs))
        print("The Brier loss score is %0.3f." % np.mean(briers))
        print("The error in proportion prediction is %0.3f." % np.mean(properr))
        print()

    # Fitting final model on the full data set.
    kbest = SelectKBest(chi2, k=2000)
    X = _stack_extra_columns(kbest.fit_transform(X, y), maxscores, sent)
    svm = BalancedBaggingClassifier(SVC(kernel='linear', C=100, probability=True),
                                                         random_state=0, n_estimators=50,
                                                          replacement=True, ratio='all')
    svm.fit(X, y)

    # Saving model.
    #with open('usa_balancedbagging.pkl', 'wb') as fout:
        #pickle.dump((vectorizer, kbest, svm), fout)

